Notes: This document summarizes only the key outputs of the final multiple imputation process for SPHC-B 2021. More detailed R scripts are available in the R Notebook “SPHC-B-2021.Rmd”.


1. Load Packages

library(naniar)
library(dplyr)
library(jomo)
library(mitml)
library(ggplot2)

2. SPHC-B 2021

2.1. Incomplete dataset

load("/Volumes/LGBT Project data/Multiple Imputation/d_2021_incomplete.RData")
summary( d_2021_incomplete )
 sampling_strata_region calibrated_weight no.of.population        sexual_identity_2021      age             sex       
 Täby       :  685      Min.   :  6.221   Min.   :  7470   Heterosexual     :20074     Min.   : 16.00   Male  :10648  
 Danderyd   :  673      1st Qu.: 42.552   1st Qu.: 28286   Homosexual       :  366     1st Qu.: 38.00   Female:12418  
 Skarpnäck  :  665      Median : 69.147   Median : 46076   Bisexual         :  584     Median : 53.00                 
 Värmdö     :  662      Mean   : 79.226   Mean   : 48535   None of the above: 1534     Mean   : 52.19                 
 Kungsholmen:  662      3rd Qu.:106.412   3rd Qu.: 63834   NA's             :  508     3rd Qu.: 67.00                 
 Södermalm  :  662      Max.   :370.732   Max.   :106702                               Max.   :100.00                 
 (Other)    :19057                                                                                                    
       country_of_birth       education         income                 marital_status  weight_strata  
 Sweden        :18101   <=9 years  : 2771   Min.   :     0   Never married    : 8130   20     :  962  
 Europe        : 2410   10-12 years: 7948   1st Qu.:  2389   Currently married:10817   12     :  941  
 Outside Europe: 2555   >=13 years :11875   Median :  3324   Other            : 4119   18     :  937  
                        NA's       :  472   Mean   :  4115                             4      :  936  
                                            3rd Qu.:  4478                             2      :  934  
                                            Max.   :497851                             17     :  933  
                                            NA's   :40                                 (Other):17423  
sapply( d_2021_incomplete, class ) # all continuous variables are numeric, and all categorical variables are factor
sampling_strata_region      calibrated_weight       no.of.population   sexual_identity_2021                    age                    sex 
              "factor"              "numeric"              "numeric"               "factor"              "numeric"               "factor" 
      country_of_birth              education                 income         marital_status          weight_strata 
              "factor"               "factor"              "numeric"               "factor"               "factor" 
miss_var_summary( d_2021_incomplete ) # 2.2% missing in sexual_identity_2021, 2.1% in education, and 0.2% in income

2.2. Two-level multivariate normal imputation

# specify imputation model
# fml_imp_2021 <- sexual_identity_2021 + education + income ~ 1 + age*sex + country_of_birth + marital_status + ( 1 | weight_strata )

# final imputation with the chosen number of iterations
# imp_final_2021 <- jomoImpute( data = d_2021_incomplete,
#                               formula = fml_imp_2021,
#                               random.L1 = "full",
#                               n.burn = 2000,
#                               n.iter = 1000,
#                               m = 20,
#                               seed = 12345
#                               ) # took around 1 hour and a half

load("/Volumes/LGBT Project data/Multiple Imputation/imp_final_2021.RData")
summary( imp_final_2021 )

Call:

jomoImpute(data = d_2021_incomplete, formula = fml_imp_2021, 
    random.L1 = "full", n.burn = 2000, n.iter = 1000, m = 20, 
    seed = 12345)

Cluster variable:         weight_strata 
Target variables:         sexual_identity_2021 education income 
Fixed effect predictors:  (Intercept) age sex country_of_birth marital_status age:sex 
Random effect predictors: (Intercept) 

Performed 2000 burn-in iterations, and generated 20 imputed data sets,
each 1000 iterations apart. 

Potential scale reduction (Rhat, imputation phase):
 
         Min   25%  Mean Median   75%   Max
Beta:  1.000 1.001 1.006  1.002 1.009 1.033
Psi:   1.000 1.001 1.002  1.001 1.002 1.005
Sigma: 1.000 1.000 1.035  1.002 1.026 1.820

Largest potential scale reduction:
Beta: [1,6], Psi: [6,6], Sigma: [5,3]

Missing data per variable:
    weight_strata sexual_identity_2021 education income sampling_strata_region calibrated_weight no.of.population age sex country_of_birth
MD% 0             2.2                  2.0       0.2    0                      0                 0                0   0   0               
    marital_status
MD% 0             
plot( imp_final_2021, trace = "all", print = "beta" )

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

2.3. Validate imputed data

# extract imputed datasets
original_data_2021 <- mitmlComplete( imp_final_2021, print = 0 ) # extract original incomplete dataset
implist_2021 <- mitmlComplete( imp_final_2021, print = "all" ) # extract all imputed datasets

original_data_2021$imputation <- "0"
all_data_2021 <- bind_rows( original_data_2021,
                            bind_rows( implist_2021, .id = "imputation" ) ) # merge datasets
all_data_2021$imputation <- as.numeric( all_data_2021$imputation )
summary( all_data_2021 )
 weight_strata           sexual_identity_2021       education          income       sampling_strata_region calibrated_weight
 20     : 20202   Heterosexual     :429863    <=9 years  : 60506   Min.   :-15246   Täby       : 14385     Min.   :  6.221  
 12     : 19761   Homosexual       :  7869    10-12 years:169887   1st Qu.:  2389   Danderyd   : 14133     1st Qu.: 42.552  
 18     : 19677   Bisexual         : 12624    >=13 years :253521   Median :  3324   Skarpnäck  : 13965     Median : 69.147  
 4      : 19656   None of the above: 33522    NA's       :   472   Mean   :  4114   Värmdö     : 13902     Mean   : 79.226  
 2      : 19614   NA's             :   508                         3rd Qu.:  4480   Kungsholmen: 13902     3rd Qu.:106.423  
 17     : 19593                                                    Max.   :497851   Södermalm  : 13902     Max.   :370.732  
 (Other):365883                                                    NA's   :40       (Other)    :400197                      
 no.of.population      age             sex               country_of_birth            marital_status     imputation
 Min.   :  7470   Min.   : 16.00   Male  :223608   Sweden        :380121   Never married    :170730   Min.   : 0  
 1st Qu.: 28286   1st Qu.: 38.00   Female:260778   Europe        : 50610   Currently married:227157   1st Qu.: 5  
 Median : 46076   Median : 53.00                   Outside Europe: 53655   Other            : 86499   Median :10  
 Mean   : 48535   Mean   : 52.19                                                                      Mean   :10  
 3rd Qu.: 63834   3rd Qu.: 67.00                                                                      3rd Qu.:15  
 Max.   :106702   Max.   :100.00                                                                      Max.   :20  
                                                                                                                  
# sexual identity in 2021
ggplot( all_data_2021[ !is.na( all_data_2021$sexual_identity_2021 ), ],
        aes( fill = sexual_identity_2021, x = imputation ) ) + 
  geom_bar( position = "fill" ) + 
  scale_y_continuous( labels = scales::percent ) + 
  scale_fill_discrete( name = "Sexual identity in 2021" ) +
  labs(
    x = "Imputation number",
    y = "Proportion",
    caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
  theme_classic() +
  theme( axis.title.x = element_text( family = "Arial", size = 11 ),
         axis.text.x = element_text( family = "Arial", size = 11 ),
         axis.text.y = element_text( family = "Arial", size = 11 ),
         axis.title.y = element_text( family = "Arial", size = 11 ),
         legend.text = element_text( family = "Arial", size = 10 ),
         legend.title = element_text( family = "Arial", size = 10 ),
         legend.position = "bottom",
         plot.caption = element_text( family = "Arial", size = 10, hjust = 0 ) 
  )


# education
ggplot( all_data_2021[ !is.na( all_data_2021$education ), ],
        aes( fill = education, x = imputation ) ) + 
  geom_bar( position = "fill" ) + 
  scale_y_continuous( labels = scales::percent ) + 
  scale_fill_discrete( name = "Level of education" ) +
  labs(
    x = "Imputation number",
    y = "Proportion",
    caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
  theme_classic() +
  theme( axis.title.x = element_text( family = "Arial", size = 11 ),
         axis.text.x = element_text( family = "Arial", size = 11 ),
         axis.text.y = element_text( family = "Arial", size = 11 ),
         axis.title.y = element_text( family = "Arial", size = 11 ),
         legend.text = element_text( family = "Arial", size = 10 ),
         legend.title = element_text( family = "Arial", size = 10 ),
         legend.position = "bottom",
         plot.caption = element_text( family = "Arial", size = 10, hjust = 0 ) 
  )


# income
summary( all_data_2021$income )
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 -15246    2389    3324    4114    4480  497851      40 
nrow( all_data_2021[ all_data_2021$income < 0 & !is.na( all_data_2021$income ), ] ) #  194 imputed values are negative
[1] 194
LS0tCnRpdGxlOiAiVmFsaWRhdGlvbiBvZiBNdWx0aXBsZSBJbXB1dGF0aW9uIGluIFNQSEMtQiAyMDIxIgphdXRob3I6IFdpbGxpIFpoYW5nICh3aWxsaS56aGFuZ0BraS5zZSksIE1hdHRlbyBRdWFydGFnbm8Kb3V0cHV0OiBodG1sX25vdGVib29rCmVkaXRvcl9vcHRpb25zOgogIGNodW5rX291dHB1dF90eXBlOiBpbmxpbmUKLS0tCgo8YnI+CgojIyMjIyAqTm90ZXM6KiBUaGlzIGRvY3VtZW50IHN1bW1hcml6ZXMgb25seSB0aGUga2V5IG91dHB1dHMgb2YgdGhlIGZpbmFsIG11bHRpcGxlIGltcHV0YXRpb24gcHJvY2VzcyBmb3IgU1BIQy1CIDIwMjEuIE1vcmUgZGV0YWlsZWQgUiBzY3JpcHRzIGFyZSBhdmFpbGFibGUgaW4gdGhlIFIgTm90ZWJvb2sgWyJTUEhDLUItMjAyMS5SbWQiXShodHRwczovL2dpdGh1Yi5jb20vd2lsbGl6aGFuZy9UZW1wb3JhbC1UcmVuZHMtaW4tU2V4dWFsLUlkZW50aXR5LWFuZC1Tb2Npb2RlbW9ncmFwaGljLURpc3Bhcml0aWVzLWluLVN0b2NraG9sbS1Db3VudHktMjAxMC10by0yMDIxL2Jsb2IvbWFpbi9TUEhDLUItMjAyMS5SbWQpLgoKPGJyPgoKIyMjIDEuIExvYWQgUGFja2FnZXMKYGBge3IgZWNobz1UUlVFLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQpsaWJyYXJ5KG5hbmlhcikKbGlicmFyeShkcGx5cikKbGlicmFyeShqb21vKQpsaWJyYXJ5KG1pdG1sKQpsaWJyYXJ5KGdncGxvdDIpCmBgYAoKIyMjIDIuIFNQSEMtQiAyMDIxCiMjIyMgMi4xLiBJbmNvbXBsZXRlIGRhdGFzZXQKYGBge3J9CmxvYWQoIi9Wb2x1bWVzL0xHQlQgUHJvamVjdCBkYXRhL011bHRpcGxlIEltcHV0YXRpb24vZF8yMDIxX2luY29tcGxldGUuUkRhdGEiKQpzdW1tYXJ5KCBkXzIwMjFfaW5jb21wbGV0ZSApCnNhcHBseSggZF8yMDIxX2luY29tcGxldGUsIGNsYXNzICkgIyBhbGwgY29udGludW91cyB2YXJpYWJsZXMgYXJlIG51bWVyaWMsIGFuZCBhbGwgY2F0ZWdvcmljYWwgdmFyaWFibGVzIGFyZSBmYWN0b3IKbWlzc192YXJfc3VtbWFyeSggZF8yMDIxX2luY29tcGxldGUgKSAjIDIuMiUgbWlzc2luZyBpbiBzZXh1YWxfaWRlbnRpdHlfMjAyMSwgMi4xJSBpbiBlZHVjYXRpb24sIGFuZCAwLjIlIGluIGluY29tZQpgYGAKCiMjIyMgMi4yLiBUd28tbGV2ZWwgbXVsdGl2YXJpYXRlIG5vcm1hbCBpbXB1dGF0aW9uCmBgYHtyfQojIHNwZWNpZnkgaW1wdXRhdGlvbiBtb2RlbAojIGZtbF9pbXBfMjAyMSA8LSBzZXh1YWxfaWRlbnRpdHlfMjAyMSArIGVkdWNhdGlvbiArIGluY29tZSB+IDEgKyBhZ2Uqc2V4ICsgY291bnRyeV9vZl9iaXJ0aCArIG1hcml0YWxfc3RhdHVzICsgKCAxIHwgd2VpZ2h0X3N0cmF0YSApCgojIGZpbmFsIGltcHV0YXRpb24gd2l0aCB0aGUgY2hvc2VuIG51bWJlciBvZiBpdGVyYXRpb25zCiMgaW1wX2ZpbmFsXzIwMjEgPC0gam9tb0ltcHV0ZSggZGF0YSA9IGRfMjAyMV9pbmNvbXBsZXRlLAojICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGZvcm11bGEgPSBmbWxfaW1wXzIwMjEsCiMgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcmFuZG9tLkwxID0gImZ1bGwiLAojICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIG4uYnVybiA9IDIwMDAsCiMgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgbi5pdGVyID0gMTAwMCwKIyAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBtID0gMjAsCiMgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc2VlZCA9IDEyMzQ1CiMgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgKSAjIHRvb2sgYXJvdW5kIDEgaG91ciBhbmQgYSBoYWxmCgpsb2FkKCIvVm9sdW1lcy9MR0JUIFByb2plY3QgZGF0YS9NdWx0aXBsZSBJbXB1dGF0aW9uL2ltcF9maW5hbF8yMDIxLlJEYXRhIikKc3VtbWFyeSggaW1wX2ZpbmFsXzIwMjEgKQpwbG90KCBpbXBfZmluYWxfMjAyMSwgdHJhY2UgPSAiYWxsIiwgcHJpbnQgPSAiYmV0YSIgKQpgYGAKCiMjIyMgMi4zLiBWYWxpZGF0ZSBpbXB1dGVkIGRhdGEKYGBge3J9CiMgZXh0cmFjdCBpbXB1dGVkIGRhdGFzZXRzCm9yaWdpbmFsX2RhdGFfMjAyMSA8LSBtaXRtbENvbXBsZXRlKCBpbXBfZmluYWxfMjAyMSwgcHJpbnQgPSAwICkgIyBleHRyYWN0IG9yaWdpbmFsIGluY29tcGxldGUgZGF0YXNldAppbXBsaXN0XzIwMjEgPC0gbWl0bWxDb21wbGV0ZSggaW1wX2ZpbmFsXzIwMjEsIHByaW50ID0gImFsbCIgKSAjIGV4dHJhY3QgYWxsIGltcHV0ZWQgZGF0YXNldHMKCm9yaWdpbmFsX2RhdGFfMjAyMSRpbXB1dGF0aW9uIDwtICIwIgphbGxfZGF0YV8yMDIxIDwtIGJpbmRfcm93cyggb3JpZ2luYWxfZGF0YV8yMDIxLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgYmluZF9yb3dzKCBpbXBsaXN0XzIwMjEsIC5pZCA9ICJpbXB1dGF0aW9uIiApICkgIyBtZXJnZSBkYXRhc2V0cwphbGxfZGF0YV8yMDIxJGltcHV0YXRpb24gPC0gYXMubnVtZXJpYyggYWxsX2RhdGFfMjAyMSRpbXB1dGF0aW9uICkKc3VtbWFyeSggYWxsX2RhdGFfMjAyMSApCgojIHNleHVhbCBpZGVudGl0eSBpbiAyMDIxCmdncGxvdCggYWxsX2RhdGFfMjAyMVsgIWlzLm5hKCBhbGxfZGF0YV8yMDIxJHNleHVhbF9pZGVudGl0eV8yMDIxICksIF0sCiAgICAgICAgYWVzKCBmaWxsID0gc2V4dWFsX2lkZW50aXR5XzIwMjEsIHggPSBpbXB1dGF0aW9uICkgKSArIAogIGdlb21fYmFyKCBwb3NpdGlvbiA9ICJmaWxsIiApICsgCiAgc2NhbGVfeV9jb250aW51b3VzKCBsYWJlbHMgPSBzY2FsZXM6OnBlcmNlbnQgKSArIAogIHNjYWxlX2ZpbGxfZGlzY3JldGUoIG5hbWUgPSAiU2V4dWFsIGlkZW50aXR5IGluIDIwMjEiICkgKwogIGxhYnMoCiAgICB4ID0gIkltcHV0YXRpb24gbnVtYmVyIiwKICAgIHkgPSAiUHJvcG9ydGlvbiIsCiAgICBjYXB0aW9uID0gIk5vdGVzOiBJbXB1dGF0aW9uIG51bWJlciAwIHJlcHJlc2VudHMgdGhlIG9yaWdpbmFsIGluY29tcGxldGUgZGF0YXNldC4iICkgKwogIHRoZW1lX2NsYXNzaWMoKSArCiAgdGhlbWUoIGF4aXMudGl0bGUueCA9IGVsZW1lbnRfdGV4dCggZmFtaWx5ID0gIkFyaWFsIiwgc2l6ZSA9IDExICksCiAgICAgICAgIGF4aXMudGV4dC54ID0gZWxlbWVudF90ZXh0KCBmYW1pbHkgPSAiQXJpYWwiLCBzaXplID0gMTEgKSwKICAgICAgICAgYXhpcy50ZXh0LnkgPSBlbGVtZW50X3RleHQoIGZhbWlseSA9ICJBcmlhbCIsIHNpemUgPSAxMSApLAogICAgICAgICBheGlzLnRpdGxlLnkgPSBlbGVtZW50X3RleHQoIGZhbWlseSA9ICJBcmlhbCIsIHNpemUgPSAxMSApLAogICAgICAgICBsZWdlbmQudGV4dCA9IGVsZW1lbnRfdGV4dCggZmFtaWx5ID0gIkFyaWFsIiwgc2l6ZSA9IDEwICksCiAgICAgICAgIGxlZ2VuZC50aXRsZSA9IGVsZW1lbnRfdGV4dCggZmFtaWx5ID0gIkFyaWFsIiwgc2l6ZSA9IDEwICksCiAgICAgICAgIGxlZ2VuZC5wb3NpdGlvbiA9ICJib3R0b20iLAogICAgICAgICBwbG90LmNhcHRpb24gPSBlbGVtZW50X3RleHQoIGZhbWlseSA9ICJBcmlhbCIsIHNpemUgPSAxMCwgaGp1c3QgPSAwICkgCiAgKQoKIyBlZHVjYXRpb24KZ2dwbG90KCBhbGxfZGF0YV8yMDIxWyAhaXMubmEoIGFsbF9kYXRhXzIwMjEkZWR1Y2F0aW9uICksIF0sCiAgICAgICAgYWVzKCBmaWxsID0gZWR1Y2F0aW9uLCB4ID0gaW1wdXRhdGlvbiApICkgKyAKICBnZW9tX2JhciggcG9zaXRpb24gPSAiZmlsbCIgKSArIAogIHNjYWxlX3lfY29udGludW91cyggbGFiZWxzID0gc2NhbGVzOjpwZXJjZW50ICkgKyAKICBzY2FsZV9maWxsX2Rpc2NyZXRlKCBuYW1lID0gIkxldmVsIG9mIGVkdWNhdGlvbiIgKSArCiAgbGFicygKICAgIHggPSAiSW1wdXRhdGlvbiBudW1iZXIiLAogICAgeSA9ICJQcm9wb3J0aW9uIiwKICAgIGNhcHRpb24gPSAiTm90ZXM6IEltcHV0YXRpb24gbnVtYmVyIDAgcmVwcmVzZW50cyB0aGUgb3JpZ2luYWwgaW5jb21wbGV0ZSBkYXRhc2V0LiIgKSArCiAgdGhlbWVfY2xhc3NpYygpICsKICB0aGVtZSggYXhpcy50aXRsZS54ID0gZWxlbWVudF90ZXh0KCBmYW1pbHkgPSAiQXJpYWwiLCBzaXplID0gMTEgKSwKICAgICAgICAgYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoIGZhbWlseSA9ICJBcmlhbCIsIHNpemUgPSAxMSApLAogICAgICAgICBheGlzLnRleHQueSA9IGVsZW1lbnRfdGV4dCggZmFtaWx5ID0gIkFyaWFsIiwgc2l6ZSA9IDExICksCiAgICAgICAgIGF4aXMudGl0bGUueSA9IGVsZW1lbnRfdGV4dCggZmFtaWx5ID0gIkFyaWFsIiwgc2l6ZSA9IDExICksCiAgICAgICAgIGxlZ2VuZC50ZXh0ID0gZWxlbWVudF90ZXh0KCBmYW1pbHkgPSAiQXJpYWwiLCBzaXplID0gMTAgKSwKICAgICAgICAgbGVnZW5kLnRpdGxlID0gZWxlbWVudF90ZXh0KCBmYW1pbHkgPSAiQXJpYWwiLCBzaXplID0gMTAgKSwKICAgICAgICAgbGVnZW5kLnBvc2l0aW9uID0gImJvdHRvbSIsCiAgICAgICAgIHBsb3QuY2FwdGlvbiA9IGVsZW1lbnRfdGV4dCggZmFtaWx5ID0gIkFyaWFsIiwgc2l6ZSA9IDEwLCBoanVzdCA9IDAgKSAKICApCgojIGluY29tZQpzdW1tYXJ5KCBhbGxfZGF0YV8yMDIxJGluY29tZSApCm5yb3coIGFsbF9kYXRhXzIwMjFbIGFsbF9kYXRhXzIwMjEkaW5jb21lIDwgMCAmICFpcy5uYSggYWxsX2RhdGFfMjAyMSRpbmNvbWUgKSwgXSApICMgIDE5NCBpbXB1dGVkIHZhbHVlcyBhcmUgbmVnYXRpdmUKYGBg